# Load necessary packages
library(dplyr)
library(ggplot2)
library(did)
library(readr)
library(purrr)

# Load necessary data

load("00_data/01_data_processed/data_epa_analysis.Rdata")

# Preparing the data for modeling

# Set treatment year for analysis
epa_only$treatment_year_reg <- ifelse(epa_only$treatment_region == 0, 0, 1995)
epa_only$treatment_year_sta <- ifelse(epa_only$treatment_state == 0, 0, 1995)

final_panel_data$treatment_year_reg <- ifelse(final_panel_data$treatment_region == 0, 0, 1995)
final_panel_data$treatment_year_sta <- ifelse(final_panel_data$treatment_state == 0, 0, 1995)

# Convert PGM_SYS_ID to a numeric identifier
epa_only$PGM_SYS_ID_num <- as.numeric(as.factor(epa_only$PGM_SYS_ID))
final_panel_data$PGM_SYS_ID_num <- as.numeric(as.factor(final_panel_data$PGM_SYS_ID))


### PLACEBO BASED INFERENCE
# Compute "real" estimate

set.seed(123)
model_EPA_REGION <- att_gt(
  yname       = "outcome_epa",
  tname       = "year",
  idname      = "PGM_SYS_ID_num",
  gname       = "treatment_year_reg",
  data        = epa_only[epa_only$year < 2002 & epa_only$year > 1989, ],
  alp         = 0.01
)

# Obtain the dynamic (event-study) estimates
real_dynamic <- aggte(model_EPA_REGION, type = "dynamic", na.rm = TRUE)
real_egt <- real_dynamic$egt
real_att_dynamic <- real_dynamic$att

# Event window
k <- length(real_att_dynamic)

# Regions and treatment

# EPA_REGION = ID for each region.
all_regions <- unique(epa_only$EPA_REGION)
n_total_regions <- length(all_regions)

# 'treatment_region'=1 if region is actually treated, 0 otherwise
treated_regions <- unique(epa_only$EPA_REGION[epa_only$treatment_region == 1])
n_treated <- length(treated_regions)

cat("Total unique regions:", n_total_regions, "\n")
cat("Number of treated regions:", n_treated, "\n")

# Permutation Loop
# We pick exactly `n_treated` from all regions
# for each permutation, label them as 'treated', then run att_gt.
# We'll store the dynamic estimates for each iteration.

set.seed(111)
n_permutations <- 1000  
placebo_att_matrix <- matrix(NA, nrow = n_permutations, ncol = k)

for (b in seq_len(n_permutations)) {
  # Randomly pick n_treated from ALL regions
  fake_treated <- sample(all_regions, size = n_treated, replace = FALSE)
  
  # Create a copy of our analysis sample
  dtemp <- epa_only[epa_only$year < 2002 & epa_only$year > 1989, ]
  
  # Define a fake gname that sets those chosen regions to 1995, else 0
  dtemp$treatment_year_reg_placebo <- ifelse(
    dtemp$EPA_REGION %in% fake_treated,
    1995,  
    0
  )
  
  # Re-run att_gt with the placebo assignment
  att_gt_fake <- att_gt(
    yname       = "outcome_epa",
    tname       = "year",
    idname      = "PGM_SYS_ID_num",
    gname       = "treatment_year_reg_placebo",
    data        = dtemp,
    alp         = 0.01)
  
  # Aggregate with type="dynamic" to get the event-study
  agg_fake_dynamic <- aggte(att_gt_fake, type = "dynamic", na.rm = TRUE)
  att_vec <- agg_fake_dynamic$att

  # Store in the matrix
  placebo_att_matrix[b, ] <- att_vec
}

# Compare real to placebo distributions

p_values_dynamic <- numeric(k)
for (j in seq_len(k)) {
  # fraction of placebo draws whose ATT for event-time j is <= real estimate
  p_values_dynamic[j] <- mean(placebo_att_matrix[, j] <= real_att_dynamic[j])
}

# Combine into a data frame:
results_df <- data.frame(
  event_time = real_egt,
  real_ATT   = real_att_dynamic,
  p_value    = p_values_dynamic
)

cat("\nDynamic ATT Randomization-Inference Results:\n")
print(results_df)

# Placebo vs. real estimates 

alpha <- 0.01

# Compute the lower/upper bounds from the placebo distribution
# for each event time (column).
lower_bounds <- apply(placebo_att_matrix, 2, quantile, probs = alpha/2, na.rm = TRUE)
upper_bounds <- apply(placebo_att_matrix, 2, quantile, probs = 1 - alpha/2, na.rm = TRUE)

# Combine into a dataframe for plotting
df_es <- data.frame(
  event_time = real_egt,
  att        = real_att_dynamic,
  ci_lower   = lower_bounds,
  ci_upper   = upper_bounds
)

df_es

placebo_means <- colMeans(placebo_att_matrix, na.rm = TRUE)
df_es$placebo_mean <- placebo_means


p1 <- ggplot(df_es, aes(x = event_time, y = att)) +
  # Line connecting the real ATT points
  geom_line(aes(color = "Identified Estimate"), linetype=2) +
  geom_point(aes(color = "Identified Estimate")) +
  # Randomization-based CIs
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper, color = "99% Permutation CB"), width = 0.2, alpha=.8) +
  geom_point(aes(y = placebo_means, color = "Permutation Mean"), shape = 17, size = 3) +
  
  scale_color_manual(
    values = c("Identified Estimate" = "black", "99% Permutation CB" = "blue", "Permutation Mean" = "blue"),
    name = "Legend"
  ) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .15, 0.04)) +
  coord_cartesian(ylim = c(-.25, .15))+
  theme(
    legend.background = element_rect(fill = "white", linewidth = 0.2, linetype = "solid", colour = "black"),
    legend.position = c(0.9, 0.2),
    legend.title = element_blank()
  )

p1

ggsave(filename="03_figures/appendix/figure_a10.pdf", height = 4, width = 10)

### STATE LEVEL CLUSTER

set.seed(123)
model_epa_region_st_cluster <- att_gt(
  yname = "outcome_epa",
  tname = "year",
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_reg",
  data = epa_only[epa_only$year < 2002 & epa_only$year > 1989, ],
  cluster = "STATE",
  alp = .01
)

# Aggregate treatment effects
aggte_model_epa_region_dynamic_st_cluster <- aggte(model_epa_region_st_cluster, type = "dynamic", na.rm = TRUE)
aggte_model_epa_region_simple_st_cluster <- aggte(model_epa_region_st_cluster, type = "simple", na.rm = TRUE)

# Plotting the results
plot_epa_region_dynamic_st_cluster <- ggdid(aggte_model_epa_region_dynamic_st_cluster)

p2 <- plot_epa_region_dynamic_st_cluster$data %>%
  ggplot(aes(x = year, y = att,
             ymin = (att - c * att.se),
             ymax = (att + c * att.se))) +
  geom_point() +
  geom_errorbar(width = .1) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate, 99% conf. band") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.35, .08, by = 0.04)) +
  coord_cartesian(ylim = c(-.35, .08))


p2

summary(model_epa_region_st_cluster)

ggsave(filename="03_figures/appendix/figure_a9.pdf", height = 3, width = 8)

save(aggte_model_epa_region_dynamic_st_cluster, file = "00_data/01_data_processed/st_cluster.Rdata")


rm(list = ls())
